Enter the directory of the maca folder on your drive and the name of the tissue you want to analyze.
tissue_of_interest = "Liver"
Load the requisite packages and some additional helper functions.
library(here)
library(useful)
library(Seurat)
library(dplyr)
library(Matrix)
library(ontologyIndex)
cell_ontology = get_ontology('https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl-basic.obo', extract_tags='everything')
validate_cell_ontology = function(cell_ontology_class){
in_cell_ontology = sapply(cell_ontology_class, function(x) is.element(x, cell_ontology$name) || is.na(x))
if (!all(in_cell_ontology)) {
message = paste0('"', cell_ontology_class[!in_cell_ontology], '" is not in the cell ontology
')
stop(message)
}
}
convert_to_cell_ontology_id = function(cell_ontology_class){
return(sapply(cell_ontology_class, function(x) as.vector(cell_ontology$id[cell_ontology$name == x])[1]))
}
save_dir = here('00_data_ingest', '04_tissue_robj_generated')
# read the metadata to get the plates we want
droplet_metadata_filename = here('00_data_ingest', '01_droplet_raw_data', 'metadata_droplet.csv')
droplet_metadata <- read.csv(droplet_metadata_filename, sep=",", header = TRUE)
colnames(droplet_metadata)[1] <- "channel"
droplet_metadata
Subset the metadata on the tissue.
tissue_metadata = filter(droplet_metadata, tissue == tissue_of_interest)[,c('channel','tissue','subtissue','mouse.sex')]
tissue_metadata
Use only the metadata rows corresponding to Bladder plates. Make a plate barcode dataframe to “expand” the per-plate metadata to be per-cell.
# Load the gene names and set the metadata columns by opening the first file
subfolder = paste0(tissue_of_interest, '-', tissue_metadata$channel[1])
raw.data <- Read10X(data.dir = here('00_data_ingest', '01_droplet_raw_data', 'droplet', subfolder))
colnames(raw.data) <- lapply(colnames(raw.data), function(x) paste0(tissue_metadata$channel[1], '_', x))
meta.data = data.frame(row.names = colnames(raw.data))
meta.data['channel'] = tissue_metadata$channel[1]
if (length(tissue_metadata$channel) > 1){
# Some tissues, like Thymus and Heart had only one channel
for(i in 2:nrow(tissue_metadata)){
subfolder = paste0(tissue_of_interest, '-', tissue_metadata$channel[i])
new.data <- Read10X(data.dir = here('00_data_ingest', '01_droplet_raw_data', 'droplet', subfolder))
colnames(new.data) <- lapply(colnames(new.data), function(x) paste0(tissue_metadata$channel[i], '_', x))
new.metadata = data.frame(row.names = colnames(new.data))
new.metadata['channel'] = tissue_metadata$channel[i]
raw.data = cbind(raw.data, new.data)
meta.data = rbind(meta.data, new.metadata)
}
}
rnames = row.names(meta.data)
meta.data <- merge(meta.data, tissue_metadata, sort = F)
row.names(meta.data) <- rnames
dim(raw.data)
[1] 23433 1924
corner(raw.data)
[1] 0 0 0 0 2
head(meta.data)
Order the cells alphabetically to ensure consistency.
ordered_cell_names = order(colnames(raw.data))
raw.data = raw.data[,ordered_cell_names]
meta.data = meta.data[ordered_cell_names,]
corner(raw.data)
[1] 0 0 0 0 2
head(meta.data)
Process the raw data and load it into the Seurat object.
# Find ERCC's, compute the percent ERCC, and drop them from the raw data.
erccs <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = TRUE)
percent.ercc <- Matrix::colSums(raw.data[erccs, ])/Matrix::colSums(raw.data)
ercc.index <- grep(pattern = "^ERCC-", x = rownames(x = raw.data), value = FALSE)
raw.data <- raw.data[-ercc.index,]
# Create the Seurat object with all the data
tiss <- CreateSeuratObject(raw.data = raw.data, project = tissue_of_interest,
min.cells = 1, min.genes = 0)
# Continue from here onwards !
tiss <- AddMetaData(object = tiss, meta.data)
tiss <- AddMetaData(object = tiss, percent.ercc, col.name = "percent.ercc")
# Change default name for sums of counts from nUMI to nReads
# colnames(tiss@meta.data)[colnames(tiss@meta.data) == 'nUMI'] <- 'nReads'
# Create metadata columns for cell_ontology_classs and subcell_ontology_classs
tiss@meta.data[,'cell_ontology_class'] <- NA
tiss@meta.data[,'subcell_ontology_class'] <- NA
Calculate percent ribosomal genes.
ribo.genes <- grep(pattern = "^Rp[sl][[:digit:]]", x = rownames(x = tiss@data), value = TRUE)
percent.ribo <- Matrix::colSums(tiss@raw.data[ribo.genes, ])/Matrix::colSums(tiss@raw.data)
tiss <- AddMetaData(object = tiss, metadata = percent.ribo, col.name = "percent.ribo")
A sanity check: genes per cell vs reads per cell.
GenePlot(object = tiss, gene1 = "nUMI", gene2 = "nGene", use.raw=T)
Filter out cells with few reads and few genes.
tiss <- FilterCells(object = tiss, subset.names = c("nGene", "nUMI"), low.thresholds = c(500, 1000))
Normalize the data, then center and scale.
tiss <- NormalizeData(object = tiss, scale.factor = 1e4)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|
tiss <- ScaleData(object = tiss)
[1] "Scaling data matrix"
|
| | 0%
|
|=====================================================================================================================================| 100%
tiss <- FindVariableGenes(object = tiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.5)
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|
Run Principal Component Analysis.
tiss <- RunPCA(object = tiss, do.print = FALSE)
tiss <- ProjectPCA(object = tiss, do.print = FALSE)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
PCElbowPlot(object = tiss)
Choose the number of principal components to use.
# Set number of principal components.
n.pcs = 10
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale. Higher resolution will give more clusters, lower resolution will give fewer.
For the top-level clustering, aim to under-cluster instead of over-cluster. It will be easy to subset groups and further analyze them below.
# Set resolution
res.used <- 3.5
tiss <- FindClusters(object = tiss, reduction.type = "pca", dims.use = 1:n.pcs,
resolution = res.used, print.output = 0, save.SNN = TRUE)
We use TSNE solely to visualize the data.
# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
tiss <- RunTSNE(object = tiss, dims.use = 1:n.pcs, seed.use = 10, perplexity=30)
TSNEPlot(object = tiss, do.label = T, pt.size = 1.2, label.size = 4)
TSNEPlot(tiss, group.by="mouse.sex")
Significant genes:
hepatocyte: Alb, Ttr, Apoa1, and Serpina1c pericentral: Cyp2e1, Glul, Oat, Gulo midlobular: Ass1, Hamp, Gstp1, Ubb periportal: Cyp2f2, Pck1, Hal, Cdh1
endothelial cells: Pecam1, Nrp1, Kdr+ and Oit3+ Kuppfer cells: Emr1, Clec4f, Cd68, Irf7 NK/NKT cells: Zap70, Il2rb, Nkg7, Cxcr6, Klr1c, Gzma B cells: Cd79a, Cd79b, Cd74 and Cd19 Immune cells: Ptprc
genes_hep = c('Alb', 'Ttr', 'Apoa1', 'Serpina1c',
'Cyp2e1', 'Glul', 'Oat', 'Gulo',
'Ass1', 'Hamp', 'Gstp1', 'Ubb',
'Cyp2f2', 'Pck1', 'Hal', 'Cdh1')
genes_endo = c('Pecam1', 'Nrp1', 'Kdr','Oit3')
genes_kuppfer = c('Emr1', 'Clec4f', 'Cd68', 'Irf7')
genes_nk = c('Zap70', 'Il2rb', 'Nkg7', 'Cxcr6', 'Gzma')
genes_b = c('Cd79a', 'Cd79b', 'Cd74')
genes_bec = c('Epcam', 'Krt19', 'Krt7')
genes_immune = 'Ptprc'
all_genes = c(genes_hep, genes_endo, genes_kuppfer, genes_nk, genes_b, genes_bec, genes_immune)
Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest. The radius shows you the percent of cells in that cluster with at least one read sequenced from that gene. The color level indicates the average Z-score of gene expression for cells in that cluster, where the scaling is done over taken over all cells in the sample.
Using the markers above, we can confidentaly label many of the clusters:
19: endothelial cells 20: bile duct epithelial cells 21: immune cells rest are hepatocytes
We will add those cell_ontology_classes to the dataset.
tiss <- StashIdent(object = tiss, save.name = "cluster.ids")
cluster.ids <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21)
free_annotation <- c(
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"endothelial cells",
"bile duct epithelial cells",
"immune cells")
cell_ontology_class <- c(
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"hepatocyte",
"endothelial cell of hepatic sinusoid",
"duct epithelial cell",
"leukocyte")
validate_cell_ontology(cell_ontology_class)
cell_ontology_id = convert_to_cell_ontology_id(cell_ontology_class)
tiss@meta.data['free_cell_ontology_class'] <- as.character(plyr::mapvalues(x = tiss@ident, from = cluster.ids, to = free_cell_ontology_class))
validate_cell_ontology(cell_ontology_class)
cell_ontology_id = convert_to_cell_ontology_id(cell_ontology_class)
tiss@meta.data['free_annotation'] <- as.character(plyr::mapvalues(x = tiss@ident, from = cluster.ids, to = free_annotation))
tiss@meta.data['cell_ontology_id'] <- as.character(plyr::mapvalues(x = tiss@ident, from = cluster.ids, to = cell_ontology_id))
Color by metadata, like plate barcode, to check for batch effects.
Let’s drill down on the hepatocytes.
subtiss <- subtiss %>% ScaleData() %>%
FindVariableGenes(do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.5) %>%
RunPCA(do.print = FALSE)
[1] "Scaling data matrix"
|
| | 0%
|
|=====================================================================================================================================| 100%
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|
PCHeatmap(object = subtiss, pc.use = 1:3, cells.use = 20, do.balanced = TRUE, label.columns = FALSE, num.genes = 8)
PCElbowPlot(subtiss)
sub.n.pcs = 8
sub.res.use = 0.5
subtiss <- subtiss %>% FindClusters(reduction.type = "pca", dims.use = 1:sub.n.pcs,
resolution = sub.res.use, print.output = 0, save.SNN = TRUE) %>%
RunTSNE(dims.use = 1:sub.n.pcs, seed.use = 10, perplexity=8)
TSNEPlot(object = subtiss, do.label = T, pt.size = .5, label.size = 4)